{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dates and Timestamps\n",
    "\n",
    "You will often find yourself working with Time and Date information, let's walk through some ways you can deal with it!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "# May take a little while on a local computer\n",
    "spark = SparkSession.builder.appName(\"dates\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = spark.read.csv(\"appl_stock.csv\",header=True,inferSchema=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
      "|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|\n",
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
      "|2010-01-04 00:00:...|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|\n",
      "|2010-01-05 00:00:...|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|\n",
      "|2010-01-06 00:00:...|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|\n",
      "|2010-01-07 00:00:...|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|\n",
      "|2010-01-08 00:00:...|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|\n",
      "|2010-01-11 00:00:...|212.79999700000002|        213.000002|        208.450005|210.11000299999998|115557400|         27.221758|\n",
      "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002|        206.419998|        207.720001|148614900|          26.91211|\n",
      "|2010-01-13 00:00:...|        207.870005|210.92999500000002|        204.099998|        210.650002|151473000|          27.29172|\n",
      "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002|        209.020004|            209.43|108223500|         27.133657|\n",
      "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003|        205.869999|            205.93|148516900|26.680197999999997|\n",
      "|2010-01-19 00:00:...|        208.330002|215.18999900000003|        207.240004|        215.039995|182501900|27.860484999999997|\n",
      "|2010-01-20 00:00:...|        214.910006|        215.549994|        209.500002|            211.73|153038200|         27.431644|\n",
      "|2010-01-21 00:00:...|        212.079994|213.30999599999998|        207.210003|        208.069996|152038600|         26.957455|\n",
      "|2010-01-22 00:00:...|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|\n",
      "|2010-01-25 00:00:...|202.51000200000001|        204.699999|        200.190002|        203.070002|266424900|26.309658000000002|\n",
      "|2010-01-26 00:00:...|205.95000100000001|        213.710005|        202.580004|        205.940001|466777500|         26.681494|\n",
      "|2010-01-27 00:00:...|        206.849995|            210.58|        199.530001|        207.880005|430642100|26.932840000000002|\n",
      "|2010-01-28 00:00:...|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|\n",
      "|2010-01-29 00:00:...|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|\n",
      "|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|\n",
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's walk through how to grab parts of the timestamp data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+\n",
      "|dayofmonth(Date)|\n",
      "+----------------+\n",
      "|               4|\n",
      "|               5|\n",
      "|               6|\n",
      "|               7|\n",
      "|               8|\n",
      "|              11|\n",
      "|              12|\n",
      "|              13|\n",
      "|              14|\n",
      "|              15|\n",
      "|              19|\n",
      "|              20|\n",
      "|              21|\n",
      "|              22|\n",
      "|              25|\n",
      "|              26|\n",
      "|              27|\n",
      "|              28|\n",
      "|              29|\n",
      "|               1|\n",
      "+----------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(dayofmonth(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+\n",
      "|hour(Date)|\n",
      "+----------+\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "|         0|\n",
      "+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(hour(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+\n",
      "|dayofyear(Date)|\n",
      "+---------------+\n",
      "|              4|\n",
      "|              5|\n",
      "|              6|\n",
      "|              7|\n",
      "|              8|\n",
      "|             11|\n",
      "|             12|\n",
      "|             13|\n",
      "|             14|\n",
      "|             15|\n",
      "|             19|\n",
      "|             20|\n",
      "|             21|\n",
      "|             22|\n",
      "|             25|\n",
      "|             26|\n",
      "|             27|\n",
      "|             28|\n",
      "|             29|\n",
      "|             32|\n",
      "+---------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(dayofyear(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+\n",
      "|month(Date)|\n",
      "+-----------+\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          1|\n",
      "|          2|\n",
      "+-----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(month(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So for example, let's say we wanted to know the average closing price per year. Easy! With a groupby and the year() function call:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+\n",
      "|year(Date)|\n",
      "+----------+\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "|      2010|\n",
      "+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(year(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
      "|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|\n",
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
      "|2010-01-04 00:00:...|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|\n",
      "|2010-01-05 00:00:...|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|\n",
      "|2010-01-06 00:00:...|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|\n",
      "|2010-01-07 00:00:...|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|\n",
      "|2010-01-08 00:00:...|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|\n",
      "|2010-01-11 00:00:...|212.79999700000002|        213.000002|        208.450005|210.11000299999998|115557400|         27.221758|2010|\n",
      "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002|        206.419998|        207.720001|148614900|          26.91211|2010|\n",
      "|2010-01-13 00:00:...|        207.870005|210.92999500000002|        204.099998|        210.650002|151473000|          27.29172|2010|\n",
      "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002|        209.020004|            209.43|108223500|         27.133657|2010|\n",
      "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003|        205.869999|            205.93|148516900|26.680197999999997|2010|\n",
      "|2010-01-19 00:00:...|        208.330002|215.18999900000003|        207.240004|        215.039995|182501900|27.860484999999997|2010|\n",
      "|2010-01-20 00:00:...|        214.910006|        215.549994|        209.500002|            211.73|153038200|         27.431644|2010|\n",
      "|2010-01-21 00:00:...|        212.079994|213.30999599999998|        207.210003|        208.069996|152038600|         26.957455|2010|\n",
      "|2010-01-22 00:00:...|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|2010|\n",
      "|2010-01-25 00:00:...|202.51000200000001|        204.699999|        200.190002|        203.070002|266424900|26.309658000000002|2010|\n",
      "|2010-01-26 00:00:...|205.95000100000001|        213.710005|        202.580004|        205.940001|466777500|         26.681494|2010|\n",
      "|2010-01-27 00:00:...|        206.849995|            210.58|        199.530001|        207.880005|430642100|26.932840000000002|2010|\n",
      "|2010-01-28 00:00:...|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|2010|\n",
      "|2010-01-29 00:00:...|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|2010|\n",
      "|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|2010|\n",
      "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.withColumn(\"Year\",year(df['Date'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+------------------+\n",
      "|avg(Year)|        avg(Close)|\n",
      "+---------+------------------+\n",
      "|   2015.0|120.03999980555547|\n",
      "|   2013.0| 472.6348802857143|\n",
      "|   2014.0| 295.4023416507935|\n",
      "|   2012.0| 576.0497195640002|\n",
      "|   2016.0|104.60400786904763|\n",
      "|   2010.0| 259.8424600000002|\n",
      "|   2011.0|364.00432532142867|\n",
      "+---------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "newdf = df.withColumn(\"Year\",year(df['Date']))\n",
    "newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']].show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Still not quite presentable! Let's use the .alias method as well as round() to clean this up!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----------+\n",
      "|  Year|Mean Close|\n",
      "+------+----------+\n",
      "|2015.0|    120.04|\n",
      "|2013.0|    472.63|\n",
      "|2014.0|    295.40|\n",
      "|2012.0|    576.05|\n",
      "|2016.0|    104.60|\n",
      "|2010.0|    259.84|\n",
      "|2011.0|    364.00|\n",
      "+------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result = newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']]\n",
    "result = result.withColumnRenamed(\"avg(Year)\",\"Year\")\n",
    "result = result.select('Year',format_number('avg(Close)',2).alias(\"Mean Close\")).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Perfect! Now you know how to work with Date and Timestamp information!"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
